{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LangSmith 评估你的RAG系统\n",
    "\n",
    "使用LangSmith平台替代手动的RAG系统评估。\n",
    "\n",
    "1. 管理数据集\n",
    "2. 评估准确性\n",
    "3. 评估延迟\n",
    "4. 可视化评估效果"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.Environment  \n",
    "需要提前准备的第三方package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install -U -q langchain  tiktoken unstructured==0.12.5 openai pandas langchain-community chromadb langchain-openai"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Set up \n",
    "\n",
    "我们将使用到Embedding model,Rerank model,Chat model,LangChain\n",
    "1. Embedding model(OpenAI key)\n",
    "2. Rerank model(Cohere key)\n",
    "3. Chat model(OpenAI key)\n",
    "4. Langchain API "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import getpass\n",
    "import os\n",
    "\n",
    "# Set your OpenAI API key\n",
    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()\n",
    "# Set your Langchain APi key \n",
    "os.environ[\"LANGCHAIN_API_KEY\"] = getpass.getpass()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.Load Data\n",
    "加载arxiv论文，以[RAGAS](https://arxiv.org/pdf/2309.15217)论文为例。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.document_loaders import ArxivLoader\n",
    "\n",
    "paper_docs = ArxivLoader(query=\"2309.15217\", load_max_docs=1).load()\n",
    "len(paper_docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.Prepare Data\n",
    "\n",
    "- text splitter\n",
    "- embedding\n",
    "- store "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.vectorstores import Chroma\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "##split\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500)\n",
    "docs = text_splitter.split_documents(paper_docs)\n",
    "\n",
    "##embed\n",
    "vectorstore = Chroma.from_documents(docs[:10], OpenAIEmbeddings())\n",
    "\n",
    "## Index\n",
    "retriever = vectorstore.as_retriever()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.Build RAG Chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RAG\n",
    "\n",
    "import openai\n",
    "from langsmith import traceable\n",
    "from langsmith.wrappers import wrap_openai\n",
    "\n",
    "class RagBot:\n",
    "    \n",
    "    def __init__(self, retriever, model: str = \"gpt-4o-2024-05-13\"):\n",
    "        self._retriever = retriever\n",
    "        # Wrapping the client instruments the LLM\n",
    "        self._client = wrap_openai(openai.Client())\n",
    "        self._model = model\n",
    "\n",
    "    @traceable()\n",
    "    def retrieve_docs(self, question):\n",
    "        return self._retriever.invoke(question)\n",
    "\n",
    "    @traceable()\n",
    "    def get_answer(self, question: str):\n",
    "        similar = self.retrieve_docs(question)\n",
    "        response = self._client.chat.completions.create(\n",
    "            model=self._model,\n",
    "            messages=[\n",
    "                {\n",
    "                    \"role\": \"system\",\n",
    "                    \"content\": \"You are an expert language model designed to answer questions about academic papers in the fields of computer science, physics, mathematics, and statistics, among others, as indexed on arXiv.\"\n",
    "                    \" Use the following docs to produce accurate answers to the user question.\\n\\n\"\n",
    "                    f\"## Docs\\n\\n{similar}\",\n",
    "                },\n",
    "                {\"role\": \"user\", \"content\": question},\n",
    "            ],\n",
    "        )\n",
    "\n",
    "        # Evaluators will expect \"answer\" and \"contexts\"\n",
    "        return {\n",
    "            \"answer\": response.choices[0].message.content,\n",
    "            \"contexts\": [str(doc) for doc in similar],\n",
    "        }\n",
    "\n",
    "\n",
    "rag_bot = RagBot(retriever)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'RAGAS (Retrieval Augmented Generation Assessment) is a framework designed for the reference-free evaluation of Retrieval Augmented Generation (RAG) pi'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response = rag_bot.get_answer(\"What is ragas?\")\n",
    "response[\"answer\"][:150]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.Load Grund-Truth Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['LANGCHAIN_TRACING_V2'] = 'true'\n",
    "os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# 加载JSON文件\n",
    "with open('ragas_qa.json', 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = []\n",
    "outputs = []\n",
    "\n",
    "for row in data:\n",
    "  question = row['question']\n",
    "  answer = row['answer']\n",
    "  inputs.append(question)\n",
    "  outputs.append(answer)\n",
    "\n",
    "qa_pairs = [{\"question\": q, \"answer\": a} for q, a in zip(inputs, outputs)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "ename": "HTTPError",
     "evalue": "[Errno 409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets] {\"detail\":\"Dataset with this name already exists.\"}",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mHTTPError\u001b[0m                                 Traceback (most recent call last)",
      "File \u001b[1;32mc:\\Users\\blackink\\.conda\\envs\\llangchainhf\\lib\\site-packages\\langsmith\\utils.py:111\u001b[0m, in \u001b[0;36mraise_for_status_with_text\u001b[1;34m(response)\u001b[0m\n\u001b[0;32m    110\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 111\u001b[0m     \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    112\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mHTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
      "File \u001b[1;32mc:\\Users\\blackink\\.conda\\envs\\llangchainhf\\lib\\site-packages\\requests\\models.py:1021\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m   1020\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[1;32m-> 1021\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n",
      "\u001b[1;31mHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mHTTPError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[33], line 6\u001b[0m\n\u001b[0;32m      4\u001b[0m client \u001b[38;5;241m=\u001b[39m Client()\n\u001b[0;32m      5\u001b[0m dataset_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrags-test2\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m----> 6\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m      7\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdataset_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdataset_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m      8\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdescription\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mQA pairs about ragas2.\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m      9\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m     10\u001b[0m client\u001b[38;5;241m.\u001b[39mcreate_examples(\n\u001b[0;32m     11\u001b[0m     inputs\u001b[38;5;241m=\u001b[39m[{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquestion\u001b[39m\u001b[38;5;124m\"\u001b[39m: q} \u001b[38;5;28;01mfor\u001b[39;00m q \u001b[38;5;129;01min\u001b[39;00m inputs],\n\u001b[0;32m     12\u001b[0m     outputs\u001b[38;5;241m=\u001b[39m[{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124manswer\u001b[39m\u001b[38;5;124m\"\u001b[39m: a} \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m outputs],\n\u001b[0;32m     13\u001b[0m     dataset_id\u001b[38;5;241m=\u001b[39mdataset\u001b[38;5;241m.\u001b[39mid,\n\u001b[0;32m     14\u001b[0m )\n",
      "File \u001b[1;32mc:\\Users\\blackink\\.conda\\envs\\llangchainhf\\lib\\site-packages\\langsmith\\client.py:2335\u001b[0m, in \u001b[0;36mClient.create_dataset\u001b[1;34m(self, dataset_name, description, data_type)\u001b[0m\n\u001b[0;32m   2325\u001b[0m dataset \u001b[38;5;241m=\u001b[39m ls_schemas\u001b[38;5;241m.\u001b[39mDatasetCreate(\n\u001b[0;32m   2326\u001b[0m     name\u001b[38;5;241m=\u001b[39mdataset_name,\n\u001b[0;32m   2327\u001b[0m     description\u001b[38;5;241m=\u001b[39mdescription,\n\u001b[0;32m   2328\u001b[0m     data_type\u001b[38;5;241m=\u001b[39mdata_type,\n\u001b[0;32m   2329\u001b[0m )\n\u001b[0;32m   2330\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mpost(\n\u001b[0;32m   2331\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapi_url \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/datasets\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   2332\u001b[0m     headers\u001b[38;5;241m=\u001b[39m{\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_headers, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mContent-Type\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapplication/json\u001b[39m\u001b[38;5;124m\"\u001b[39m},\n\u001b[0;32m   2333\u001b[0m     data\u001b[38;5;241m=\u001b[39mdataset\u001b[38;5;241m.\u001b[39mjson(),\n\u001b[0;32m   2334\u001b[0m )\n\u001b[1;32m-> 2335\u001b[0m \u001b[43mls_utils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status_with_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2336\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ls_schemas\u001b[38;5;241m.\u001b[39mDataset(\n\u001b[0;32m   2337\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mresponse\u001b[38;5;241m.\u001b[39mjson(),\n\u001b[0;32m   2338\u001b[0m     _host_url\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_host_url,\n\u001b[0;32m   2339\u001b[0m     _tenant_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_optional_tenant_id(),\n\u001b[0;32m   2340\u001b[0m )\n",
      "File \u001b[1;32mc:\\Users\\blackink\\.conda\\envs\\llangchainhf\\lib\\site-packages\\langsmith\\utils.py:113\u001b[0m, in \u001b[0;36mraise_for_status_with_text\u001b[1;34m(response)\u001b[0m\n\u001b[0;32m    111\u001b[0m     response\u001b[38;5;241m.\u001b[39mraise_for_status()\n\u001b[0;32m    112\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mHTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m--> 113\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mHTTPError(\u001b[38;5;28mstr\u001b[39m(e), response\u001b[38;5;241m.\u001b[39mtext) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
      "\u001b[1;31mHTTPError\u001b[0m: [Errno 409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets] {\"detail\":\"Dataset with this name already exists.\"}"
     ]
    }
   ],
   "source": [
    "from langsmith import Client\n",
    "\n",
    "# Create dataset\n",
    "client = Client()\n",
    "dataset_name = \"rags-test2\"\n",
    "dataset = client.create_dataset(\n",
    "    dataset_name=dataset_name,\n",
    "    description=\"QA pairs about ragas2.\",\n",
    ")\n",
    "client.create_examples(\n",
    "    inputs=[{\"question\": q} for q in inputs],\n",
    "    outputs=[{\"answer\": a} for a in outputs],\n",
    "    dataset_id=dataset.id,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7.Evaluate RAG "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# RAG chain\n",
    "def predict_rag_answer(example: dict):\n",
    "    \"\"\"Use this for answer evaluation\"\"\"\n",
    "    response = rag_bot.get_answer(example[\"question\"])\n",
    "    return {\"answer\": response[\"answer\"]}\n",
    "\n",
    "def predict_rag_answer_with_context(example: dict):\n",
    "    \"\"\"Use this for evaluation of retrieved documents and hallucinations\"\"\"\n",
    "    response = rag_bot.get_answer(example[\"question\"])\n",
    "    return {\"answer\": response[\"answer\"], \"contexts\": response[\"contexts\"]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langsmith.evaluation import LangChainStringEvaluator, evaluate\n",
    "\n",
    "# Evaluator\n",
    "qa_evalulator = [\n",
    "    LangChainStringEvaluator(\n",
    "        \"qa\",\n",
    "        prepare_data=lambda run, example: {\n",
    "            \"prediction\": run.outputs[\"answer\"],\n",
    "            \"reference\": example.outputs[\"answer\"],\n",
    "            \"input\": example.inputs[\"question\"],\n",
    "        },\n",
    "      ),\n",
    "]\n",
    "experiment_results = evaluate(\n",
    "    predict_rag_answer,\n",
    "    data=dataset_name,\n",
    "    evaluators=qa_evalulator,\n",
    "    experiment_prefix=\"ragas-paper-qa-gp4o\",\n",
    "    metadata={\"variant\": \"ragas-paper-page2, gpt-4o-2024-05-13\"},\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
